Data visualization of viral contigs prediction by VirSorter, VirSorter2, DeepVirFinder¶

In [1]:
from math import ceil

import os
import glob
from os.path import join
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
In [2]:
# Set seaborn plotting aesthetics as default
sns.set()

Virsorter pre-processing¶

In [3]:
virsorter_list = glob.glob("virsorter/*/VIRSorter_global-phage-signal.csv")
len(virsorter_list)
Out[3]:
123
In [4]:
def read_tsv_file(fp):
    try:
        # Attempt to read the TSV file
        df = pd.read_csv(fp, comment="#", sep=",", header=None)
        # Add a new column with the directory name
        return df
    except pd.errors.EmptyDataError:
        # Determine the number of columns by checking the header of another file
        sample_df = pd.read_csv(virsorter_list[0], comment="#", sep=",", header=None, nrows=1)
        columns = ["Contig_id", "Nb genes contigs", "Fragment", "Nb genes", "Category", "Nb phage hallmark genes", "Phage gene enrichment sig", "Non-Caudovirales phage gene enrichment sig", "Pfam depletion sig", "Uncharacterized enrichment sig", "Strand switch depletion sig", "Short genes enrichment sig"]
        # Create a DataFrame with null values for each column
        empty_df = pd.DataFrame([["null"] * 12])
        return empty_df
In [5]:
df_virsorter = pd.concat([read_tsv_file(fp).assign(New=os.path.basename(os.path.dirname(fp))) for fp in virsorter_list])
df_virsorter.columns = ["Contig_id", "Nb genes contigs", "Fragment", "Nb genes", "Category", "Nb phage hallmark genes", "Phage gene enrichment sig", "Non-Caudovirales phage gene enrichment sig", "Pfam depletion sig", "Uncharacterized enrichment sig", "Strand switch depletion sig", "Short genes enrichment sig", "Sample_name"]
sample_name = df_virsorter.pop("Sample_name")
df_virsorter.insert(0,"Sample_name", sample_name)
df_virsorter['Contig_id'] = df_virsorter['Contig_id'].str.replace('VIRSorter_', '', regex=False)
df_virsorter['Contig_id'] = df_virsorter['Contig_id'].str.replace('-circular', '', regex=False)

df_virsorter['length'] = df_virsorter['Contig_id'].str.split('_').str[3]
df_virsorter['length'] = df_virsorter['length'].str.replace(' ', '', regex=False)
df_virsorter['length'] = df_virsorter['length'].astype(float)
df_virsorter = df_virsorter.loc[df_virsorter['length'] >= 1000 ]
df_virsorter
Out[5]:
Sample_name Contig_id Nb genes contigs Fragment Nb genes Category Nb phage hallmark genes Phage gene enrichment sig Non-Caudovirales phage gene enrichment sig Pfam depletion sig Uncharacterized enrichment sig Strand switch depletion sig Short genes enrichment sig length
0 BER_0XMTAHIFY NODE_11015_length_1722_cov_4_046191 5 VIRSorter_NODE_11015_length_1722_cov_4_046191 5 1 1.0 2.41026382682884 NaN 2.96905455092977 NaN NaN NaN 1722.0
1 BER_0XMTAHIFY NODE_14423_length_1447_cov_3_596983 4 VIRSorter_NODE_14423_length_1447_cov_3_596983 4 1 1.0 3.04440229453496 NaN 2.37524364074382 NaN NaN NaN 1447.0
2 BER_0XMTAHIFY NODE_15705_length_1371_cov_2_724164 3 VIRSorter_NODE_15705_length_1371_cov_2_724164 3 1 1.0 2.28330172090122 NaN NaN NaN NaN NaN 1371.0
3 BER_0XMTAHIFY NODE_2110_length_5594_cov_7_496480 8 VIRSorter_NODE_2110_length_5594_cov_7_496480 8 1 3.0 6.08880458906991 NaN 4.75048728148764 NaN NaN NaN 5594.0
4 BER_0XMTAHIFY NODE_2979_length_4285_cov_4_086525 8 VIRSorter_NODE_2979_length_4285_cov_4_086525 8 1 1.0 2.25950718871954 NaN 3.36313445460326 NaN NaN NaN 4285.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
275 TOK_XV92PXO2D NODE_58_length_84686_cov_7_783460 86 VIRSorter_NODE_58_length_84686_cov_7_783460-ge... 32 3 NaN NaN NaN gene_48-gene_79:7.37854174991945 gene_52-gene_75:4.47459491092940 NaN NaN 84686.0
1 Unknown_YSOUSNDN9 NODE_150_length_1622_cov_3_324825 7 VIRSorter_NODE_150_length_1622_cov_3_324825 7 2 NaN 3.735218 NaN NaN NaN NaN NaN 1622.0
3 Unknown_YSOUSNDN9 NODE_253_length_1439_cov_3_143064 6 VIRSorter_NODE_253_length_1439_cov_3_143064 6 2 NaN 3.201616 NaN NaN NaN NaN NaN 1439.0
5 Unknown_YSOUSNDN9 NODE_374_length_1301_cov_9_247191 4 VIRSorter_NODE_374_length_1301_cov_9_247191 4 2 NaN 2.13441 NaN NaN NaN NaN NaN 1301.0
6 Unknown_YSOUSNDN9 NODE_652_length_1108_cov_21_948718 5 VIRSorter_NODE_652_length_1108_cov_21_948718 5 2 NaN 2.668013 NaN NaN NaN NaN NaN 1108.0

11462 rows × 14 columns

In [6]:
df_virsorter_counted = df_virsorter.groupby(["Sample_name"]).count()
df_virsorter_counted = df_virsorter_counted[["Contig_id"]]
df_virsorter_counted.rename(columns={'Contig_id': 'Contig_id_VirSorter'}, inplace= True)
df_virsorter_counted.reset_index(inplace=True)
df_virsorter.groupby(["Sample_name"]).count()
Out[6]:
Contig_id Nb genes contigs Fragment Nb genes Category Nb phage hallmark genes Phage gene enrichment sig Non-Caudovirales phage gene enrichment sig Pfam depletion sig Uncharacterized enrichment sig Strand switch depletion sig Short genes enrichment sig length
Sample_name
BER_0XMTAHIFY 155 155 155 155 155 50 61 8 143 78 6 37 155
BER_1FZ1M3LBT 1 1 1 1 1 0 1 0 0 0 0 0 1
BER_5UIO53M4H 37 37 37 37 37 7 31 2 17 1 0 0 37
BER_62VRQ43DO 20 20 20 20 20 9 14 2 18 6 1 1 20
BER_6LAJ3M790 2 2 2 2 2 0 2 0 1 0 0 0 2
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TOK_EZO4LULQ4 94 94 94 94 94 12 83 3 58 8 1 10 94
TOK_N2P2DPKIY 85 85 85 85 85 24 59 2 72 13 0 19 85
TOK_SM080U360 64 64 64 64 64 4 64 0 0 0 0 3 64
TOK_XV92PXO2D 171 171 171 171 171 47 106 5 151 36 1 37 171
Unknown_YSOUSNDN9 4 4 4 4 4 0 4 0 0 0 0 0 4

114 rows × 13 columns

VirSorter2¶

In [7]:
virsorter2_list = glob.glob("virsorter2/*/final-viral-score.tsv")
len(virsorter2_list)
Out[7]:
123
In [8]:
df_virsorter2 = pd.concat([pd.read_csv(fp, sep="\t").assign(Sample_name=os.path.basename(os.path.dirname(fp))) for fp in virsorter2_list])
sample_name_v2 = df_virsorter2.pop("Sample_name")
df_virsorter2.insert(0,"Sample_name", sample_name_v2)
df_virsorter2[['seqname', 'type']] = df_virsorter2['seqname'].str.split(r'\|\|', expand=True)
df_virsorter2.rename(columns={'seqname': 'Contig_id'}, inplace= True)
df_virsorter2['Contig_id'] = df_virsorter2['Contig_id'].str.replace('.','_')
#df_virsorter2 = df_virsorter2.loc[df_virsorter2['max_score'] >= 0.9]
df_virsorter2 = df_virsorter2.loc[df_virsorter2['length'] >= 1000]
df_virsorter2 
C:\Users\Olga\AppData\Local\Temp\ipykernel_31648\2783735064.py:1: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
  df_virsorter2 = pd.concat([pd.read_csv(fp, sep="\t").assign(Sample_name=os.path.basename(os.path.dirname(fp))) for fp in virsorter2_list])
Out[8]:
Sample_name Contig_id dsDNAphage NCLDV RNA ssDNA lavidaviridae max_score max_score_group length hallmark viral cellular type
0 BER_0XMTAHIFY NODE_53_length_176986_cov_28_951625 0.853 0.040 0.000 0.000 0.000 0.853 dsDNAphage 176555 6 30.1 16.5 full
1 BER_0XMTAHIFY NODE_106_length_100541_cov_12_450958 0.833 0.007 0.000 0.000 0.000 0.833 dsDNAphage 98010 1 24.6 22.9 full
2 BER_0XMTAHIFY NODE_171_length_60244_cov_8_122348 0.887 0.213 0.000 0.000 0.007 0.887 dsDNAphage 60132 9 28.4 27.0 full
3 BER_0XMTAHIFY NODE_324_length_29490_cov_8_063836 0.967 0.213 0.000 0.000 0.020 0.967 dsDNAphage 29488 1 38.9 2.8 full
4 BER_0XMTAHIFY NODE_370_length_25515_cov_17_079615 0.967 0.020 0.000 0.007 0.013 0.967 dsDNAphage 25513 11 54.5 36.4 full
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
29 Unknown_YSOUSNDN9 NODE_882_length_1018_cov_2_137072 0.440 NaN 0.255 0.960 0.380 0.960 ssDNA 1018 0 33.3 0.0 full
30 Unknown_YSOUSNDN9 NODE_886_length_1016_cov_2_861602 0.333 NaN 0.340 0.093 0.527 0.527 lavidaviridae 1015 0 33.3 0.0 full
31 Unknown_YSOUSNDN9 NODE_163_length_1594_cov_3_375569 NaN NaN NaN NaN NaN NaN dsDNAphage 1594 1 100.0 0.0 lt2gene
32 Unknown_YSOUSNDN9 NODE_212_length_1493_cov_3_253129 NaN NaN NaN NaN NaN NaN dsDNAphage 1493 1 100.0 0.0 lt2gene
33 Unknown_YSOUSNDN9 NODE_900_length_1011_cov_2_092050 NaN NaN NaN NaN NaN NaN dsDNAphage 1011 1 100.0 0.0 lt2gene

87783 rows × 14 columns

In [9]:
df_virsorter2_counted = df_virsorter2.groupby(["Sample_name"]).count()
df_virsorter2_counted = df_virsorter2_counted[["Contig_id"]]
df_virsorter2_counted.rename(columns={'Contig_id': 'Contig_id_VirSorter2'}, inplace= True)
df_virsorter2_counted.reset_index(inplace=True)
df_virsorter2.groupby(["Sample_name"]).count()
Out[9]:
Contig_id dsDNAphage NCLDV RNA ssDNA lavidaviridae max_score max_score_group length hallmark viral cellular type
Sample_name
BER_0XMTAHIFY 1176 1074 863 1074 1074 1074 1096 1176 1176 1176 1176 1176 1176
BER_1FZ1M3LBT 2 2 2 2 2 2 2 2 2 2 2 2 2
BER_5UIO53M4H 487 442 365 442 442 442 450 487 487 487 487 487 487
BER_62VRQ43DO 110 105 103 105 105 105 106 110 110 110 110 110 110
BER_6LAJ3M790 18 17 16 17 17 17 18 18 18 18 18 18 18
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TOK_EZO4LULQ4 772 718 644 718 718 718 728 772 772 772 772 772 772
TOK_N2P2DPKIY 757 674 523 674 674 674 682 757 757 757 757 757 757
TOK_SM080U360 318 287 252 287 287 287 295 318 318 318 318 318 318
TOK_XV92PXO2D 1394 1243 980 1243 1243 1243 1256 1394 1394 1394 1394 1394 1394
Unknown_YSOUSNDN9 33 30 21 30 30 30 30 33 33 33 33 33 33

121 rows × 13 columns

DeepVirFinder¶

In [10]:
deepvirfinder_list = glob.glob("deepvirfinder/*/*dvfpred.txt")
len(deepvirfinder_list)
Out[10]:
123
In [11]:
df_deepvirfinder = pd.concat([pd.read_csv(fp, sep="\t").assign(Sample_name=os.path.basename(os.path.dirname(fp))) for fp in deepvirfinder_list])
sample_name_dvf = df_deepvirfinder.pop("Sample_name")
df_deepvirfinder.insert(0,"Sample_name", sample_name_dvf)
df_deepvirfinder = df_deepvirfinder.rename(columns={'name': 'Contig_id'})
df_deepvirfinder['Contig_id'] = df_deepvirfinder['Contig_id'].str.replace('.','_')
df_deepvirfinder = df_deepvirfinder.loc[df_deepvirfinder['score'] >= 0.9] ## 10% error
df_deepvirfinder = df_deepvirfinder.loc[df_deepvirfinder['len'] >= 1000]
df_deepvirfinder
Out[11]:
Sample_name Contig_id len score pvalue
40 BER_0XMTAHIFY NODE_42_length_194988_cov_8_780940 194988 0.910056 0.017033
65 BER_0XMTAHIFY NODE_2_length_796906_cov_246_742030 796906 0.908315 0.017109
149 BER_0XMTAHIFY NODE_150_length_66695_cov_8_224595 66695 0.949682 0.013672
165 BER_0XMTAHIFY NODE_154_length_66337_cov_8_275369 66337 0.929793 0.015579
166 BER_0XMTAHIFY NODE_170_length_60866_cov_8_550509 60866 0.933900 0.015258
... ... ... ... ... ...
804 Unknown_YSOUSNDN9 NODE_802_length_1049_cov_2_828974 1049 0.999941 0.001246
817 Unknown_YSOUSNDN9 NODE_820_length_1042_cov_2_851064 1042 0.999861 0.001888
854 Unknown_YSOUSNDN9 NODE_848_length_1031_cov_1_960041 1031 0.933558 0.015258
866 Unknown_YSOUSNDN9 NODE_868_length_1025_cov_1_929897 1025 1.000000 0.000000
888 Unknown_YSOUSNDN9 NODE_890_length_1014_cov_2_138686 1014 0.996957 0.005306

70478 rows × 5 columns

In [12]:
deepvirfinder_counted = df_deepvirfinder.groupby(["Sample_name"]).count()
deepvirfinder_counted = deepvirfinder_counted[["Contig_id"]]
deepvirfinder_counted.rename(columns={'Contig_id': 'Contig_id_DVF'}, inplace= True)
deepvirfinder_counted.reset_index(inplace=True)
deepvirfinder_counted 
Out[12]:
Sample_name Contig_id_DVF
0 BER_0XMTAHIFY 679
1 BER_1FZ1M3LBT 13
2 BER_5UIO53M4H 1241
3 BER_62VRQ43DO 142
4 BER_6LAJ3M790 45
... ... ...
118 TOK_EZO4LULQ4 531
119 TOK_N2P2DPKIY 694
120 TOK_SM080U360 361
121 TOK_XV92PXO2D 1069
122 Unknown_YSOUSNDN9 45

123 rows × 2 columns

Contig number comparison¶

Data preparation¶

In [13]:
merged_df_counted = df_virsorter_counted.merge(df_virsorter2_counted, on="Sample_name", how='outer').merge(deepvirfinder_counted, on = "Sample_name", how='outer')
merged_df_counted_vs_only = df_virsorter_counted.merge(df_virsorter2_counted, on="Sample_name")
vs = df_virsorter_counted["Sample_name"].to_list()
vs2 = df_virsorter2_counted["Sample_name"].to_list()
dvf = deepvirfinder_counted["Sample_name"].to_list()
In [14]:
merged_df_counted_melted = merged_df_counted.melt(id_vars=["Sample_name"], value_vars=['Contig_id_VirSorter','Contig_id_VirSorter2','Contig_id_DVF'], var_name="Tool", value_name="Nb. of contigs")
merged_df_counted_melted = merged_df_counted_melted.sort_values('Nb. of contigs', ascending=False)
merged_df_counted_melted_vs_only = merged_df_counted.melt(id_vars=["Sample_name"], value_vars=['Contig_id_VirSorter','Contig_id_VirSorter2'], var_name="Tool", value_name="Nb. of contigs")
merged_df_counted_melted_vs_only = merged_df_counted_melted_vs_only.sort_values('Nb. of contigs', ascending=False)

Data visualization¶

In [15]:
def grouped_bar_plot(df, a):
    # Customize here: If there are multiple samples, add `order = samples_ordered` to the barplot command
    p = sns.barplot(data = df, x = 'Sample_name', y = "Nb. of contigs", hue = 'Tool', \
                    palette = "colorblind", ax = a) 
    p.tick_params(axis='x', labelrotation=90, size = 5)
    p.set_ylabel('Number of Contigs', size = 40)
    p.set_xlabel('Samples Names', size = 40)
    p.get_legend().remove()
    return p
In [16]:
%%capture

fig, axs = plt.subplots(1, 1, figsize = (28,12)) # in inches
plt.xticks(size = 13)
plt.yticks(size = 25)

axs.set_title("Number of Viral contigs identified  by each tool", size = 60, pad=30)
In [17]:
grouped_bar_plot(merged_df_counted_melted, axs)
Out[17]:
<Axes: title={'center': 'Number of Viral contigs identified  by each tool'}, xlabel='Samples Names', ylabel='Number of Contigs'>

Compare number of contigs identified as viral by each tool

In [18]:
# Add a global legend referring to all and plot the graph
handles, labels = axs.get_legend_handles_labels()
fig.legend(handles, ["DeepVirFinder","VirSorter2","VirSorter"], loc = 'center right', fontsize = 40)
fig
Out[18]:
No description has been provided for this image
In [19]:
%%capture

fig1, axs1 = plt.subplots(1, 1, figsize = (28,12)) # in inches
plt.xticks(size = 13)
plt.yticks(size = 25)

axs1.set_title("Number of Viral contigs identified  by VirSorter and VirSorter2", size = 60, pad=30)
In [20]:
grouped_bar_plot(merged_df_counted_melted_vs_only, axs1)
Out[20]:
<Axes: title={'center': 'Number of Viral contigs identified  by VirSorter and VirSorter2'}, xlabel='Samples Names', ylabel='Number of Contigs'>

Do the same comparison but only for VirSOrter and VirSorter2

In [21]:
# Add a global legend referring to all and plot the graph
handles, labels = axs1.get_legend_handles_labels()
fig1.legend(handles, ["VirSorter2","VirSorter"], loc = 'center right', fontsize = 40)
fig1
Out[21]:
No description has been provided for this image

Compare viral contigs identified by each tool¶

In [22]:
import pandas as pd
from matplotlib_venn import venn3, venn3_circles
import matplotlib.pyplot as plt

plot Venn diagram for VirSorter VirSSorter2 and DeepVirFinder

df_virsorter df_virsorter2 df_deepvirfinder

In [23]:
virsorter_contigs = set(df_virsorter["Contig_id"])
virsorter2_contigs = set(df_virsorter2["Contig_id"])
dvf_contigs = set(df_deepvirfinder["Contig_id"])
In [24]:
plt.figure(figsize=(10, 7))
venn3([virsorter_contigs, virsorter2_contigs,dvf_contigs], ('VirSorter', 'VirSorter2', 'DeepVirFinder'), set_colors=("#0173b2", "#de8f05", "#029e73" ), alpha=0.6)
venn3_circles(subsets=(virsorter_contigs, virsorter2_contigs,dvf_contigs), linewidth=0.5)  
plt.title('Venn Diagram of commonalities and differencies between pipelines results',  size = 20, pad=30)
plt.show()
No description has been provided for this image

Vien diagram for each sample seaprately¶

In [25]:
df_virsorter['tool'] = 'VirSorter'
df_virsorter2['tool'] = 'VirSorter2'
df_deepvirfinder['tool'] = 'DVF'

combined_df = pd.concat([df_virsorter, df_virsorter2, df_deepvirfinder])

# Get the list of unique samples
sample_names = combined_df['Sample_name'].unique()

# Function to plot Venn diagram for each sample
def plot_venn_for_sample(sample):
    sample_data = combined_df[combined_df['Sample_name'] == sample]
    
    contig_ids_1 = set(sample_data[sample_data['tool'] == 'VirSorter']['Contig_id'])
    contig_ids_2 = set(sample_data[sample_data['tool'] == 'VirSorter2']['Contig_id'])
    contig_ids_3 = set(sample_data[sample_data['tool'] == 'DVF']['Contig_id'])
    
    plt.figure(figsize=(10, 7))
    venn3([contig_ids_1, contig_ids_2, contig_ids_3], ('VirSorter', 'VirSorter2', 'DVF'), set_colors=("#0173b2", "#de8f05", "#029e73" ))
    plt.title(f'Venn Diagram for Sample {sample}')
    plt.show()

# Plot Venn diagram for each unique sample
for sample in sample_names:
    plot_venn_for_sample(sample)
No description has been provided for this image
C:\Users\Olga\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\matplotlib_venn\layout\venn3\pairwise.py:169: UserWarning: Bad circle positioning.
  warnings.warn("Bad circle positioning.")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
C:\Users\Olga\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\matplotlib_venn\layout\venn3\pairwise.py:103: UserWarning: Circle A has zero area.
  warnings.warn("Circle A has zero area.")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
C:\Users\Olga\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\matplotlib_venn\layout\venn3\pairwise.py:107: UserWarning: Circle B has zero area.
  warnings.warn("Circle B has zero area.")
No description has been provided for this image
No description has been provided for this image

heatmap¶

Load number of all contigs

Take Sample name and AMR value

In [26]:
df_amr = pd.read_csv("../metadata.csv", sep=",")
df_amr = df_amr[["Sample_name","Sum_RPKM"]].copy()
In [ ]:
 

Load data. As DVF saves the result for each contig, its dataframe was used to retrieve all contigs name form the dataset

In [27]:
all_contigs = pd.concat([pd.read_csv(fp, sep="\t").assign(Sample_name=os.path.basename(os.path.dirname(fp))) for fp in deepvirfinder_list])
sample_name_dvf = all_contigs.pop("Sample_name")
all_contigs.insert(0,"Sample_name", sample_name_dvf)
all_contigs = all_contigs.rename(columns={'name': 'Contig_id'})
all_contigs['Contig_id'] = all_contigs['Contig_id'].str.replace('.','_')
all_contigs['contig_id_all'] = all_contigs['Contig_id']
all_contigs = all_contigs.loc[all_contigs['len'] >= 1000]
all_contigs
Out[27]:
Sample_name Contig_id len score pvalue contig_id_all
0 BER_0XMTAHIFY NODE_25_length_283529_cov_12_214824 283529 3.357433e-01 0.111054 NODE_25_length_283529_cov_12_214824
1 BER_0XMTAHIFY NODE_22_length_301175_cov_14_146377 301175 5.025283e-01 0.060258 NODE_22_length_301175_cov_14_146377
2 BER_0XMTAHIFY NODE_24_length_291638_cov_77_923847 291638 4.616548e-01 0.080520 NODE_24_length_291638_cov_77_923847
3 BER_0XMTAHIFY NODE_23_length_292037_cov_32_250947 292037 6.629067e-01 0.036124 NODE_23_length_292037_cov_32_250947
4 BER_0XMTAHIFY NODE_21_length_307159_cov_86_194820 307159 6.482564e-01 0.037257 NODE_21_length_307159_cov_86_194820
... ... ... ... ... ... ...
929 Unknown_YSOUSNDN9 NODE_931_length_1001_cov_1_978858 1001 1.618433e-01 0.160208 NODE_931_length_1001_cov_1_978858
930 Unknown_YSOUSNDN9 NODE_910_length_1007_cov_2_834034 1007 6.921884e-01 0.033783 NODE_910_length_1007_cov_2_834034
931 Unknown_YSOUSNDN9 NODE_932_length_1000_cov_2_369312 1000 8.186931e-10 0.985403 NODE_932_length_1000_cov_2_369312
932 Unknown_YSOUSNDN9 NODE_929_length_1001_cov_2_540169 1001 2.468593e-03 0.444520 NODE_929_length_1001_cov_2_540169
933 Unknown_YSOUSNDN9 NODE_928_length_1001_cov_2_651163 1001 5.650779e-01 0.046359 NODE_928_length_1001_cov_2_651163

1849893 rows × 6 columns

In [28]:
all_counted = all_contigs.groupby(["Sample_name"]).count()
all_counted = all_counted[["contig_id_all"]]
all_counted.reset_index(inplace=True)
all_counted 
Out[28]:
Sample_name contig_id_all
0 BER_0XMTAHIFY 25272
1 BER_1FZ1M3LBT 73
2 BER_5UIO53M4H 15502
3 BER_62VRQ43DO 2175
4 BER_6LAJ3M790 675
... ... ...
118 TOK_EZO4LULQ4 13595
119 TOK_N2P2DPKIY 14386
120 TOK_SM080U360 39740
121 TOK_XV92PXO2D 28325
122 Unknown_YSOUSNDN9 934

123 rows × 2 columns

load initial df

In [29]:
deepvirfinder_counted
df_virsorter_counted
df_virsorter2_counted
Out[29]:
Sample_name Contig_id_VirSorter2
0 BER_0XMTAHIFY 1176
1 BER_1FZ1M3LBT 2
2 BER_5UIO53M4H 487
3 BER_62VRQ43DO 110
4 BER_6LAJ3M790 18
... ... ...
116 TOK_EZO4LULQ4 772
117 TOK_N2P2DPKIY 757
118 TOK_SM080U360 318
119 TOK_XV92PXO2D 1394
120 Unknown_YSOUSNDN9 33

121 rows × 2 columns

merge all_contigs column¶

Merge all contigs column and sort it based on Sum_RPKM values

In [30]:
deepvirfinder_rpkm = deepvirfinder_counted.merge(all_counted, on="Sample_name", how="outer").merge(df_amr, on="Sample_name").sort_values("Sum_RPKM", ascending= False).fillna(0)
virsorter_rpkm = df_virsorter_counted.merge(all_counted, on="Sample_name", how="outer").merge(df_amr, on="Sample_name").sort_values("Sum_RPKM", ascending= False).fillna(0)
virsorter2_rpkm = df_virsorter2_counted.merge(all_counted, on="Sample_name", how="outer").merge(df_amr, on="Sample_name").sort_values("Sum_RPKM", ascending= False).fillna(0)
In [31]:
deepvirfinder_rpkm["DeepVirFinder_density"] = deepvirfinder_rpkm["Contig_id_DVF"]/deepvirfinder_rpkm["contig_id_all"]
virsorter_rpkm["VirSorter_density"] = virsorter_rpkm["Contig_id_VirSorter"]/virsorter_rpkm["contig_id_all"]
virsorter2_rpkm["VirSorter2_density"] = virsorter2_rpkm["Contig_id_VirSorter2"]/virsorter2_rpkm["contig_id_all"]
In [32]:
virsorter2_rpkm_indexed = virsorter2_rpkm.set_index(virsorter2_rpkm["Sample_name"])
virsorter_rpkm_indexed = virsorter_rpkm.set_index(virsorter_rpkm["Sample_name"])
deepvirfinder_rpkm_indexed = deepvirfinder_rpkm.set_index(deepvirfinder_rpkm["Sample_name"])
In [33]:
deepvirfinder_rpkm_indexed
Out[33]:
Sample_name Contig_id_DVF contig_id_all Sum_RPKM DeepVirFinder_density
Sample_name
ILR_U6M78A4O4 ILR_U6M78A4O4 302 15884 1663.135815 0.019013
Other_B8T9MC9HX Other_B8T9MC9HX 11 39 1251.715658 0.282051
ILR_5O6P46B1O ILR_5O6P46B1O 1520 54313 1251.715658 0.027986
NYC_0OSAG4JMN NYC_0OSAG4JMN 326 13305 1164.001298 0.024502
ILR_YGMN39LS1 ILR_YGMN39LS1 806 33652 790.184125 0.023951
... ... ... ... ... ...
Other_2MFW9YKGQ Other_2MFW9YKGQ 7 81 2.594936 0.086420
Other_RGFMTG8DA Other_RGFMTG8DA 52 274 2.432254 0.189781
DOH_Z97N7FJ2K DOH_Z97N7FJ2K 18 212 1.722112 0.084906
BER_6LAJ3M790 BER_6LAJ3M790 45 675 1.530793 0.066667
Other_34XFCIUBK Other_34XFCIUBK 31 495 1.337777 0.062626

123 rows × 5 columns

In [34]:
df_original_density = virsorter_rpkm_indexed[["VirSorter_density"]].merge(virsorter2_rpkm_indexed[["VirSorter2_density"]], on= "Sample_name").merge(deepvirfinder_rpkm_indexed[["DeepVirFinder_density"]], on = "Sample_name")
In [35]:
df_original_density
Out[35]:
VirSorter_density VirSorter2_density DeepVirFinder_density
Sample_name
ILR_U6M78A4O4 0.012465 0.050932 0.019013
Other_B8T9MC9HX 0.000000 0.358974 0.282051
ILR_5O6P46B1O 0.006812 0.056837 0.027986
NYC_0OSAG4JMN 0.008568 0.057121 0.024502
ILR_YGMN39LS1 0.011946 0.058243 0.023951
... ... ... ...
Other_2MFW9YKGQ 0.000000 0.024691 0.086420
Other_RGFMTG8DA 0.003650 0.051095 0.189781
DOH_Z97N7FJ2K 0.018868 0.051887 0.084906
BER_6LAJ3M790 0.002963 0.026667 0.066667
Other_34XFCIUBK 0.000000 0.024242 0.062626

123 rows × 3 columns

In [36]:
cb = sns.color_palette("viridis", 2000)
In [37]:
fig2, axs2 = plt.subplots(1, 1, figsize = (40,40))
plt.xticks(size = 50)
axs2.set_title("Viral density per sample", size = 60, pad=30)
p2 = sns.heatmap(df_original_density, linewidth=.5, cmap = cb, vmax=0.15)
p2.set_ylabel("Sample", size=40)
p2
Out[37]:
<Axes: title={'center': 'Viral density per sample'}, ylabel='Sample'>
No description has been provided for this image